"""Scrape chapter links from a tutorial table-of-contents page.

Fetches the page with a randomized User-Agent, isolates the
table-of-contents block with a regex, then extracts and prints
(href, title) pairs from its anchor tags.
"""
import re

import requests
from fake_useragent import UserAgent

url = 'http://c.biancheng.net/python_spider/re-module.html'
# Random User-Agent reduces the chance of the request being blocked as a bot.
headers = {
    "User-Agent": UserAgent().random
}
# Fetch the page.  A timeout keeps the script from hanging forever on a
# dead server, and raise_for_status() surfaces HTTP errors early instead
# of silently parsing an error page.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
response.encoding = 'utf-8'
info = response.text
# print(info)

# Isolate the contents <div>.  NOTE(review): greedy `.*` with re.S spans
# from the FIRST "contents" div to the LAST "article-wrap" div — confirm
# this is the intended region of the page (kept as-is to preserve behavior).
pattern = re.compile(r'<div id="contents">.*<div id="article-wrap">', re.S)
matches = pattern.findall(info)
if not matches:
    # The original indexed [0] unconditionally and would crash with a bare
    # IndexError if the page layout changed; fail with a clear message instead.
    raise SystemExit("table-of-contents block not found in page")
r_content = matches[0]

# Example anchor: <a href="/python_spider/what-is-spider.html">网络爬虫是什么</a>
pattern = re.compile(r'<a href="(?P<href>.*?)">(?P<title>.*?)</a>', re.S)
# finditer() returns an iterator object, which is always truthy — the
# original `if r_list:` guard was a no-op, so iterate directly.
for r_info in pattern.finditer(r_content):
    if ".html" not in r_info.group('href'):
        continue  # skip anchors that are not chapter pages
    print("href:", r_info.group('href'))
    print("title:", r_info.group('title'))
    print(20 * "*")
# Expected output (sample):
# href: /python_spider/what-is-spider.html
# title: 网络爬虫是什么
# ********************
# href: /python_spider/webpage.html
# title: 网页构成
# ********************